%
% File naaclhlt2009.tex
%
% Contact: nasmith@cs.cmu.edu

\documentclass[11pt]{article}
\usepackage{naaclhlt2009}
\usepackage{times}
\usepackage{latexsym}
\usepackage{epsfig}
\setlength\titlebox{6.5cm}    % Expanding the titlebox

\newcommand{\abs}[1]{\left\vert#1\right\vert}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\citet}[1]{\cite{#1}}
\newcommand{\citep}[1]{\cite{#1}}
\newcommand{\ut}{\textit}
\newcommand{\lit}[1]{\footnote{Lit. translation: #1}}

\newcommand{\efgr}[2]{
  \begin{figure}[htbp]
    \makebox[8.5cm]{\framebox[5cm]{\rule{0pt}{5cm}}}
    \caption{#2}
    \label{#1}
  \end{figure}
}

\newcommand{\fgrparam}[4]{
  \begin{figure}[htbp]
    \begin{center}
      \leavevmode
      \includegraphics[#1]{#2}
    \end{center}
    \caption{#4}
    \label{#3}
  \end{figure}
}

\title{Transformation-based Learning for Semantic parsing}

\author{F. Jur\v{c}\'{i}\v{c}ek, F. Mairesse, M. Ga\v{s}i\'{c}, S. Keizer, B. Thomson, K. Yu, and S. Young \\
\\
Engineering Department \\
Cambridge University \\
Trumpington Street, Cambridge, CB2 1PZ, UK \\
\\
{\tt \{fj228, farm2, mg436, sk561, brmt2, ky219, sjy\}@eng.cam.ac.uk}
}

\date{}

\begin{document}
\maketitle
\begin{abstract}
In this paper, we present a semantic parser which transforms initial naive semantic hypothesis into correct semantics by using an ordered set of rules. These rules are learned automatically from the training corpus with no prior linguistic knowledge.

We show that TBL parser is competitive to the state-of-the-art semantic parser on the ATIS task without using any handcrafted linguistic knowledge.

\end{abstract}

\textbf{Check how you call STEP/TBL parser!}

\textbf{Should I define semantic parsing? Parsing at all??}

\textbf{Check how you use input, output, hypothesis semantics!}


\section{Introduction}
Semantic parsing is important part of a spoken dialogue system \cite{williams07}. The goal of a semantic parsing is to map natural language to formal meaning representation - semantics. Such semantics can be either defined by a grammar, e.g. LR grammar for GeoQuery domain \cite{kate05} or by frames and slots e.g. TownInfo domain \cite{thomson08}. In Table \ref{tbl:sem:example}, it is shown an example of the frame and slot semantics from ATIS dataset. Each frame has a goal and set of slots. Each slot is composed of a slot name, e.g. ``from.city'', a slot equal sign, e.g. ``='' or ``!='', and slot value, e.g. ``Washington''. As dialogue managers commonly use semantics in the form frames and slots, our approach learns how to directly map from natural language into frame and slot semantics.

% The semantics, which is designed by a domain expert, is directly executable by a question answering system \cite{wong06} or by a dialogue manager in a spoken dialogue system.

\begin{table}
\begin{center}
\begin{small}\begin{tabular}{|lll|} 
  \hline
  \multicolumn{3}{l}{``what are the lowest airfare from Washington DC} \\
  \multicolumn{3}{l}{to Salt Lake City''} \\
  \hline
  GOAL          & = & airfare \\
  airfare.type  & = & lowest \\
  from.city     & = & Washington \\
  from.state    & = & DC \\
  to.city       & = & Salt Lake City \\
  \hline
\end{tabular} 
\end{small}\end{center}
\caption{Example of frame and slot semantics from ATIS \cite{atis94} dataset.}
\label{tbl:sem:example}
\end{table}

A dialogue system needs a semantic parser which is accurate and robust, easy to build, and fast. First, a parser presented in this paper performs comparable to the state-of-the-art semantics parser and it can deal with ill formed utterances. Second, it does not need any handcrafted linguistic knowledge and it can learn from data which is not semantically annotated at the word-level. Finally, it learns a compact set of rules which results in limited number of operation per semantic concept.

% can be understood as machine translation from a natural language to a formal language. First, we do not not have formal grammar for natural language is ungrammatical, include hesitations, and very often only fragments of complete sentences, e.g. ``Boston to Miami tomorrow''. 

In our approach, we adapt transformation-based learning (TBL) \cite{brill95} to the problem of semantic parsing. We attempt to find an ordered list of transformation rules to improve a naive semantic annotation. A transformation parsing assumes:
\begin{itemize}
\item Initial annotation is obtained by a naive annotator.
\item Transformations are applied in sequence. 
\item Transformations progressively change from general to specific.
\item Results of previous transformations are visible to following transformations.
\item Transformations correct errors of the previous transformations.
\end{itemize} 

In the next section, we describe previous work on mapping natural language into formal meaning representation. Section \ref{sec:tbl} presents an example of TBL based semantic parsing, discuss templates for transformation rules, and describes the learning process. Section \ref{sec:evaluation} compares TBL parser to the previously developed semantic parsers on ATIS \cite{atis94} and TownInfo \cite{mairesse09} datasets.

\section{Related work}

In Section \ref{sec:evaluation}, we compare performance of our method with the four existing systems that were evaluated on the same dataset we consider. 

First, hidden Vector State model \cite{he06} have been used to model an approximation of a pushdown automaton which has semantic concepts as non-terminal symbols. Consequently, a deterministic algorithm was used to recover slot values.

Second, automatic induction of combinatory categorical grammar \cite{zettlemoyer07} have been used to map sentences to lambda-calculus. The combinatory categorical grammar is generalized into probabilistic model by learning log-linear model. An online learning algorithm update weights of features representing a parse tree of an input sentence. They show that their technique produces the state-of-the-art performance on the Air Travel Information (ATIS) dataset \cite{atis94}. \textbf{However, }apart from using the the lexical categories (city names, airport names, etc) readily available from the ATIS corpus, they also need considerable number of handcrafted entries in their initial lexicon. 

Third, Markov logic networks \cite{meza08a,meza08b} have been used to extract slot values by applying ideas of a Markov network to first-order logic. In this approach, weights are attached to first-order clauses which represent relationship between slot names and its values. Such weighted clauses are used as templates for features of Markov networks.

Finally, support vector machines \cite{mairesse09} have been used to build a semantic trees by recursive calling classifiers to estimate probabilities of production rules using a linear kernel and word based features.

There has been also done a large amount of research into mapping natural language to semantics that is not directly comparable because it uses either different corpora or different meaning representation. 

Wong \cite{wong06} used machine translation techniques with a syntax-based translation model based on the synchronous context-free grammars. 
Inductive logic programing \cite{tang01} have been used to incrementally develop a theory including a set of predicates. In each iteration, the predicates were generalized from predicates in the theory and predicates automatically constructed from examples. 

Transformation techniques \cite{kate05} have been used to sequentially rewrite an utterance into semantics. However, our approach differ in the way how the semantics is constructed. Instead of rewriting an utterance, we transform initial naive semantic hypothesis. As a result, we can use input words several times to trigger transformations of the semantics. This extends our ability to handle non-compositionality phenomena in spoken language.

Kate \cite{kate08} used support vector machines and tree kernels to integrate knowledge contained in the gold standard dependency trees to capture long-range relationship between words. 

\section{Transformation-based parsing} \label{sec:tbl}
This section describes the transformation-based parser. First of all, we give an example of the parsing algorithm. Secondly, we describe  templates used to generate rules for the inference process. Finally, we detail the learning process. 

\subsection{Example of Parsing} \label{sec:tbl:example}
The semantic parser transforms initial naive semantic hypothesis into correct semantics by applying transformations from an ordered set of rules. Each rule is composed of a trigger and a transformation and a trigger initiates a transformation of a hypothesis.

The parsing consists of three steps: 
\begin{enumerate}
  \item initial semantics is assigned as hypothesis
  \item sequentially apply all rules\footnote{Input utterance is not modified by rules. As a result, words from the utterance can be trigger several different transformations.}
  \item output hypothesis semantics
\end{enumerate}

We demonstrate the parsing on an example. Think of the utterance: \textit{``find all the flights between Toronto and San Diego that arrive on Saturday''} 

First, the goal ``flight'' is used as the initial goal because it is the most common goal in the ATIS dataset and no slots are added in the semantics. As a result, the initial semantics is as follows:

\vspace{.25cm}
\begin{tabular}{lll}
  GOAL & = & flight
\end{tabular} 
\vspace{.25cm}

Secondly, the rules whose triggers match the sentence and the hypothesis are sequentially applied. Generally rules add slots, delete slots, and substitute slots. However, in this example the matching rules are only those which add slots.

\vspace{.25cm}
\begin{tabular}{ll}
  trigger & transformation \\
  \hline 
  ``between               & add slot \\
    toronto and''         &``from.city=Toronto'' \\
  ``and san diego''       & add slot \\
                          & ``to.city=San Diego'' \\
  ``saturday''            & add slot \\
                          & ``departure.day=Saturday'' \\
\end{tabular} 
\vspace{.25cm}

As a result of the transformations, we obtain the following semantic hypothesis. 

\vspace{.25cm}
\begin{tabular}{lll}
  GOAL          & = & flight \\
  from.city     & = & Toronto \\
  to.city       & = & San Diego \\
  departure.day & = & Saturday \\
\end{tabular} 
\vspace{.25cm}

The trigger ``and Sand Diego'' is example of non-compositionality, in which the words in an utterance do not have a one-to-one correspondence with the slots in the semantics. The word ``and'' indicates that the city ``San Diego'' is slot value of the slot ``to.city''. 

As the TBL method tends to learn and apply general rules first, the parser learns to associate the date and time values with the ``departure*'' slots because the date and time values are mostly associated with slots describing properties of departure in the ATIS dataset. The incorrect classification of the word ``Saturday'' is a result of such generalization. 

However, TBL method learns to correct its errors. Therefore, the parser also applies at later stage of parsing the error correcting rules. For example, the following rule corrects the slot name of the slot value ``Saturday''.

\vspace{.25cm}
\begin{tabular}{ll}
  trigger & transformation \\
  \hline 
  ``arrive''            & substitute slot from\\
                        & ``departure.day=*'' to \\
                        & ``arrive.day=*'' \\
\end{tabular} 
\vspace{.25cm}

In this case, we substitute the slot name with the correct values. The final semantic hypothesis is as follows.

\vspace{.25cm}
\begin{tabular}{lll}
  GOAL          & = & flight \\
  from.city     & = & Toronto \\
  to.city       & = & San Diego \\
  saturday.day & = & Saturday \\
\end{tabular} 
\vspace{.25cm}

\subsection{Slot alignment}

So far we considered an utterance as a bag of words and no notion of locality was considered. For example, before we perform the substitution of the slot ``departure.day'' to ``arrival.day'', we should test whether word ``arrive'' is in the vicinity of the slot. The reason is that we do not want to trigger the substitution of the slot ``from.city=Toronto'' to ``to.city=Toronto'' because the parser can also learn the rule as follows. 

\vspace{.25cm}
\begin{tabular}{ll}
  trigger & transformation \\
  \hline 
  ``arrive''            & substitute slot from\\
                        & ``from.city=*'' to \\
                        & ``to.city=*'' \\
\end{tabular} 
\vspace{.25cm}
\fgrparam{width=8cm}{./fig/words-slots-alignment.pdf}{fig:alignment}{Alignment between words and the slots in the example utterance.}

% \textbf{XXX: I mentioned delete transformation but never defined.}

One way to deal with this problem is to constrain triggers performing substitution to be activated only if they are in the vicinity of the slot lexical realization. We track the words from the utterance, which were used in triggers. Every time we apply a transformation of a slot, we store links between the words which triggered the transformation and the target slot. Such links are called as direct alignment. 

% For the delete transformation no tracking information is kept because a slot is removed from hypothesis and never used again.

In Figure \ref{fig:alignment} (a), we see the alignment between the words and the slots in the example utterance after applying the first set of rules. The full lines denote direct alignment created by the add-transformations. The dashed lines denote derived alignment - alignment computed from the direct alignment. Because no rules were triggered by words ``find all the flights'' and ``that arrive on'' those words could not be aligned directly to any of the slots, we have to derive such alignment. To compute derived alignment, first we order the slots so that the slot aligned with the left-most word is the first and the ordering results in minimum instances of direct alignment crossing. Than, every unaligned word is aligned with the nearest left and the right slot. In Figure\ref{fig:alignment} (a), the phrase ``find all the flights'' can be aligned to the slot ``from.city=Toronto'' only. The phrase ``that arrive on'' can be aligned to two slots ``to.city=San Diego'' and ``departure.day=Saturday''.

In Figure \ref{fig:alignment} (b), we see the alignment after applying the substitution. We can see change in the alignment of the phrase ``that arrive on''. First, the word ``arrive'' is aligned to the slot ``arrive.day=Saturday''. Second, the word ``on'' must be aligned to the same slot as the word ``arrive'',  otherwise the derived alignment would cross the direct alignment. There is no change in the alignment of the word ``that''.

\subsection{Rule templates}
In the previous section, we presented several rules which add, substitute, or delete slots. These rules were selected by a learning algorithm from a large set of all potential rules. Such rules are generated from a set of templates for triggers and transformations. The templates define what types of rules can be used for parsing. 

A trigger controls when a transformation of a semantic hypothesis can be performed and it can question an input utterance, an output semantics, or both. In our method, a trigger contains one or more conditions as follows.
\begin{itemize}
  \item The utterance contains n-gram N.
  \item The utterance contains skipping\footnote{In a skipping bigram, one or more words are skipped between two words. For example, ('arrive',*,'Boston') is a skipping bigram which skips one word.} bigram B.
  \item The goal equals to G.
  \item The semantics contains slot S.
\end{itemize}
If a trigger contains more than one condition, then all conditions must be satisfied. A transformation performs one of these operations:
\begin{itemize}
  \item Substitutes a goal to G.
  \item Adds a slot S.
  \item Deletes a slot S
  \item Substitutes a slot S.
\end{itemize}
A substitution transformation can either substitute a whole slot, a slot name, a slot equal sign, or a slot value.

\subsection{Learning} \label{sec:tbl:learning}
The main idea of transformation-based learning is to learn an ordered list of rules which incrementally improve initially assigned naive semantic hypothesis (see the algorithm in Figure \ref{alg:tbl:learning}) and the initial assignment is made based on simple statistics. The semantics with the most common goal slots are added is used as initial semantics. The learning is conducted in a greedy fashion and at each step TBL chooses the transformation rule that reduces the largest number of errors in our hypothesis. Number of errors includes number of goal substitutions, number of slot insertions, number of slot deletions, number of slot substitutions. The learning ends when no rule that improves the hypothesis beyond the pre-set threshold can be found.

To limit overfitting the training data, we prune some rules which are learned at the end of the learning. We sequentially apply each rule on the development set and we measure the number of errors. At the end, we chose the N first rules for which the parser gets the lowest number of errors.

\begin{figure}
\begin{small}
\textsc{
\begin{enumerate}
  \item initial semantics is assigned as hypothesis to each utterance
  \item repeat as long as the number of errors on the training set decreases
  \begin{enumerate}
    \item generate all possible rules which correct at least one error in the training set
    \item measure number of corrected errors by each rule
    \item select the rule with the largest number of corrected errors
    \item apply the selected rule to the current state of the training set.
    \item Stop if the number of corrected errors is smaller then threshold T.
  \end{enumerate}
  \item prune rules
\end{enumerate}}
\end{small}
\caption{Rule learning algorithm.}
\label{alg:tbl:learning}
\end{figure} 

As in the previous work \cite{zettlemoyer07,mairesse09,meza08b}, we make use of database with lexical realizations of some slots, for example city and airport names. The use of such database results in more robust parser because the number of possible slot values for each slot is usually very high. In the input utterance, we replace lexical realizations of slot values with category labels, e.g. ``i want to fly from CITY''. Similarly, we replace slot values in the semantics.

To speed up the training process, we select multiple best performing rules and the performance of worst selected rule has to be at least at least 80\% of the best rule. We found that selection of multiple rules during learning does not affect the performance of the parser and at the same time it decrease the learning time.

% During the decoding phase, the test set is initialized with the same initial class's assignment. Each rule is than applied, in the order it was learned, to the test set. The final classification is the one attained when all rules have been applied.

\section{Evaluation} \label{sec:evaluation}

In this section, we evaluate our parser on two distinct corpora, and compare our results with the state-of-the-art techniques and handcrafted rule-based parser. 

\subsection{Datasets}

In order to compare our results with previous work \cite{he06,zettlemoyer07,meza08b,mairesse09},
we apply our method to the Air Travel Information System dataset
(ATIS) \cite{atis94}. This dataset consists of user requests for flight information, for example ``find flight from San Diego to Phoenix on Monday''. We use 5012 utterances for training, and the DEC94 dataset as development data. As in previous work, we test our method on the 448 utterances of the NOV93 dataset, and the evaluation criteria is the F-measure of the number of reference slot/value pairs that appear in the output semantic (e.g., from.city = New York). He \& Young detail the test data extraction process in \cite{he05}.

Our second dataset consists of tourist information dialogues in a fictitious
town (TownInfo). The dialogues were collected through user
trials in which users searched for information about a specific venue
by interacting with a dialogue system in a noisy background. These
dialogues were previously used for training dialogue management
strategies \cite{williams07,thomson08}. 

For example, ``what is the address of Char Sue'' is represented as 

\vspace{.25cm}
\begin{tabular}{lll}
  GOAL          & = & request \\
  address       & = & Char Sue \\
\end{tabular} 
\vspace{.25cm}

and ``I would like a Chinese restaurant?'' as

\vspace{.25cm}
\begin{tabular}{lll}
  GOAL       & = & inform \\
  food       & = & Chinese \\
  type       & = & restaurant \\
\end{tabular} 
\vspace{.25cm}

The TownInfo training, development, and test sets respectively contain
8396, 986 and 1023 transcribed utterances.  The data includes the transcription of the top hypothesis of the ATK speech recogniser, which allows us to evaluate the robustness of our models to recognition
errors (word error rate = 34.4\%). 
We compare our model with STC praser \cite{mairesse09} and the handcrafted Phoenix grammar \cite{ward91} used in the trials \cite{williams07,thomson08}. The Phoenix parser implements a partial matching algorithm that was designed for robust spoken language understanding.

For both corpora are available databases with lexical entries for slot values e.g. city names, airport names, etc. 

\subsection{Improving disambiguation of long-range dependencies}

\fgrparam{width=7cm}{./fig/dep-tree.pdf}{fig:dep:tree}{Dependency tree of the sentence ''Show the cheapest flights from New York to San Jose arriving before 7pm on Monday`` generated by the RASP parser \cite{rasp06}.}

Besides simple n-grams and skipping bigrams more complex lexical features can be used. Kate \citep{kate08} used gold standard word dependencies to capture long-range relationship between words. At its simplest, dependency trees are one of the most concise way to describe language syntax. Essentially, each word is viewed as the dependent of one other word, with the exception of a single word which that is the root of the sentence. Kate showed that word dependencies significantly improve semantic parsing because long-range dependencies from an utterance tend to be local in a dependency tree. For example, the words ''arriving`` and ''Monday`` are neighbors in the dependency tree but they are four words apart in the sentence (see Figure \ref{fig:dep:tree}).

Instead of using gold standard word dependencies, we used dependencies provided by RASP dependency parser \cite{rasp06}. First of all, we had to add capitalization and punctuation into the ATIS data to be able to use the RASP parser. The RASP parser without proper capitalization fails to tag ''new`` and ''york`` as NP and instead of this it tags ''new`` as ''JJ`` and 'york' as NP and the dependencies generated by the parser are unsatisfactory. Secondly, we generated new n-gram features from dependency trees. Even though the dependencies generated the RASP parser are no absolutely accurate, the new features increase performance in F-measure on ATIS data. 

Secondly, we generated long-range features by using POS tags\footnote{We used POS tags provided by the RASP parser; however, any POS tagger can be used instead.}. 
Our motivation was work of \cite{meza08a,meza08b} who handcrafted features using words ''arrive``, ''arriving``, ''leave``, and ''leaving``. These handcrafted features disambiguate large number of semantic parsing errors in ATIS data because large portion or errors is caused by confusions between concepts ''arrival.time`` and ''departure.time``, ''arrival.day`` and ''departure.day``, etc. We generalized this approach and we automatically find features which  disambiguate semantics of words like ''Monday``, ''7pm``, and ''Boston``. As a result, we generate a new type of bigrams for a word and the nearest verb, preposition, etc. \textbf{We use all parts-of-speech provided by RASP and the learning algorithm chooses the most discriminative features. Among those learned are not only the words used by Meza-Rui but also words like ''stop``, ''reach``, ''buy`` and prepositions like ''at``, ''from``, ''to``, etc.} For example, for the nearest verb for the word ''Morning`` in the sentence from Figure \ref{fig:dep:tree} is ''arrive`` and such bigram would look be written as ('arrive',VV,'Monday') where VV stands for verb. This features assumes that the left-to-right tendency is dominant and the words in vicinity of lexical realization of a slot value affect the meaning the most.

\subsection{Results}

\begin{table}
\begin{center}
\begin{small}
\begin{tabular}{|l|ccc|}
\hline \makebox[2.99cm]{\bf Parser} & \makebox[1.1cm]{\bf Prec} & \makebox[1.1cm]{\bf Rec} & \bf F \\ \hline 
\multicolumn{4}{l}{\textbf{TownInfo dataset with transcribed utterances:}} \\
\hline
TBL      & 96.05 & 94.66 & 95.35 \\
STC      & 97.39 & 94.05 & \textbf{95.69} \\
Phoenix  & 96.33 & 94.22 & 95.26 \\
\hline
\multicolumn{4}{l}{\textbf{TownInfo dataset with ASR output:}} \\
\hline
TBL      & 92.72 & 83.42 & 87.82 \\
STC      & 94.03 & 83.73 & \textbf{88.58} \\
Phoenix & 90.28 & 79.49 & 84.54 \\
\hline
\multicolumn{4}{l}{\textbf{ATIS dataset with transcribed utterances:}} \\
\hline
TBL   & 96.37 & 95.12 & 95.74 \\
PCCG  & 95.11 & 96.71 & \textbf{95.9} \\
STC   & 96.73 & 92.37 & 94.50 \\
HVS   & - & - & 90.3  \\
MLN   & - & - & 92.99 \\
\hline
\end{tabular}
\end{small}
\end{center}
\caption{Slot/value precision (Prec), recall (Rec) and F-measure (F) for the ATIS and TownInfo datasets. TBL parser is compared with Phoenix parser and STC classifier \cite{mairesse09} on the TownInfo dataset and compared with HVS parser \cite{he06}, MLN parser \cite{meza08b}, STC classifier, and PCCG parser \cite{zettlemoyer07} on the ATIS dataset}
\label{tbl:results-final} 
\end{table}

The results for both detasets are shown in Table \ref{tbl:results-final}.
The model accuracy is measured in terms of F-measure of the slot/value pairs. Both slot and the value must be correct to count as correct classification. We report precision, recall, and F-mesure (harmonic mean of precision and recall).

Results on the AITS dataset show that our method with F-measure 95.74\% is competitive with rerspect to Zettlemoyer \& Collins' PCCG model \cite{zettlemoyer07} (F-mesure=95.9\%). Note that PCCG model makes use of considerable large number of handracfted entries in their initial lexicon. In addition, our method ouperforms STC, MLN and HVS parsers. 

Concerning the TownInfo dataset, Table \ref{tbl:results-final} shows that TBL produces 87.87\% F-mesure, wich represents a 3.28\%improvemnt over achieved by the handcrafted Phoenix parser, but 0.76\% lower compared with STC model.

% We believe that STC is performs better becasue it consider the STC classifier use all features at one time. STC makes decision in one step using all the features rather than making several decisions by several rules as STEP.
% We found that the dialogue act type recognition accuracy of the STEP parser is lower than STC's; as a result, we tried to use SVM as STC does to classify dialogue act types. 
% We hoped for an increase of F-measure as result of increased dialogue act type accuracy. However, we did not get any increase in F-measure.

\efgr{fig:learning:curve}{The learning curve shows the relation between number of learned rules and the F-measure for both TI and ATIS corpora.}

The number of learned rules is very small. As is shown in the figure \ref{fig:learning:curve}, learning curves for both training data and development data are very steep. Although our current strategy for choosing the final number of rules for decoding is to keep only the rules for which we obtain highest F-measure on the development data, we could use much less rules without scarifying accuracy. For example, we accepted 0.1\% lower F-measure on the development data than we would need only YYY rules in comparison with XXX rules if select the number of rules based in the highest F-measure. In contrast, the initial lexicon the CCG parser \cite{zettlemoyer07} contains about 180 complex entries for general English words or phrases and yet additional lexical entries must be learned. \textbf{explain better}

Also, the number of rules per semantic concept (dialogue act or slot name) is very low. In TI data, we have XXX different dialogue acts and XXX slot and the average number of rules per semantic concept is XXX. In case of ATIS data, we have XXX dialogue acts and XXX slots and the average number of rules per semantic concept is XXX.

Lexical realizations of a slot can overlap with lexical realization of neighbouring slots. It is shows to be important pattern, for example in the trigram (city-0,and,city-1) is very common for sentence including ''between city-0 and city-1``. The lexical realizations city-0, city-1 respectively would be classified as from.city, and city-1 just because we know the  


\textbf{Speed}

\section{Conclusion}

This paper presents novel application of TBL for semantic parsing. Our method learns sequence of rules which transforms initial naive semantics into correct semantics. It significantly differ from the method presented by Kate et al \cite{kate05} where they were rewriting utterance and replacing its words with semantic concepts. 

Our method was applied to two very different domains and it was shown that it performs competitive on both datasets with respect to the state-of-the-art semantic parsers. Results show that our method is comeptitive with respect to Zettlemoyer \& Collins' PCCG model \cite{zettlemoyer07} on ATIS dataset. In addition, our method outperforms STC, MLN and HVS by 1.27\%, 2.75\%, and 5.44\% respectively. We also show that our method outperforms the handcrafted Phoenix parser on ASR output and it is competitive with respect to STC on TownInfo dataset - TBL's performance is only 0.76\% lower compared with STC model.



\section*{Acknowledgments}

We would like to thank to Luke Zettlemoyer and I.V. Meza-Ruiz for valuable discussions about their methods.

\begin{thebibliography}{}

\bibitem[\protect\citename{Brill}1995]{brill95}
E. Brill.
\newblock 1995.
\newblock {\em Transformation-based error-driven learning
and natural language processing: A case study in part-of-speech
tagging}.
\newblock Computational Linguistics, 21(4):543?565.

\bibitem[\protect\citename{Briscoe et al}2006]{rasp06}
E. Briscoe, J. Carroll and R. Watson.
\newblock 2006.
\newblock {\em The Second Release of the RASP System}.
\newblock Proceedings of COLING/ACL.

\bibitem[\protect\citename{Dahl et al}1994]{atis94}
D.A. Dahl, M. Bates, M. Brown, W. Fisher, K. Hunicke-Smith,
D. Pallett, C. Pao, A. Rudnicky, and E. Shriberg.
\newblock 1994.
\newblock {\em Expanding the scope of the ATIS task: The ATIS-3 corpus}.
\newblock Proceedings of the ARPA HLT Workshop.

\bibitem[\protect\citename{He and Young}2005]{he05}
Y. He and S. Young.
\newblock 2005.
\newblock {\em Semantic processing using the
hidden vector state model}.
\newblock Computer Speech \& Language,vol. 19, no. 1, pp. 85-106.

\bibitem[\protect\citename{He and Young}2006]{he06}
Y. He and S. Young.
\newblock 2006.
\newblock {\em Spoken language understanding using the hidden vector state model}.
\newblock Computer Speech \& Language, vol. 19, no. 1, pp. 85-106.

\bibitem[\protect\citename{Kate}2005]{kate05}
R.J. Kate, Y.W. Wong and R.J. Mooney.
\newblock 2005.
\newblock {\em Learning to Transform Natural to Formal Languages}.
\newblock Proceedings of AAAI.

\bibitem[\protect\citename{Kate}2008]{kate08}
R.J. Kate.
\newblock 2008.
\newblock {\em A Dependency-based Word Subsequence Kernel}.
\newblock Proceedings of EMNLP.

\bibitem[\protect\citename{Mairesse et al}2009]{mairesse09}
F. Mairesse, M. Gasic, F. Jurcicek, S. Keizer, B. Thomson, K. Yu, and S. Young.
\newblock 2009.
\newblock {\em Spoken Language Understanding from Unaligned Data using Discriminative Classification Models}.
\newblock Proceedings of ICASSP.

\bibitem[\protect\citename{Meza et al}2008a]{meza08a}
I.V. Meza-Ruiz, S. Riedel and O. Lemon.
\newblock 2008.
\newblock {\em Accurate statistical spoken language understanding from limited development resources}.
\newblock Proceedings of ICASSP.

\bibitem[\protect\citename{Meza et al}2008b]{meza08b}
I.V. Meza-Ruiz, S. Riedel and O. Lemon.
\newblock 2008.
\newblock {\em Spoken Language Understanding in dialogue systems, using a 2-layer Markov Logic Network: improving semantic accuracy}.
\newblock Proceedings of Londial.

\bibitem[\protect\citename{Tang et al}2001]{tang01}
L.R. Tang and R. J. Mooney 
\newblock 2001.
\newblock {\em Using Multiple Clause Constructors in Inductive Logic Programming for Semantic Parsing}.
\newblock Proceedings of ECML.

\bibitem[\protect\citename{Thomson et al}2008]{thomson08}
B. Thomson, M. Gasic, S. Keizer, F. Mairesse, J. Schatzmann,
K. Yu, and S. Young.
\newblock 2008.
\newblock {\em User study of the Bayesian update of dialogue state approach to dialogue management}.
\newblock Proceedings of Interspeech.

\bibitem[\protect\citename{Ward}1991]{ward91}
W.H. Ward.
\newblock 1991.
\newblock {\em The Phoenix system: Understanding spontaneous
speech}.
\newblock Proceedings of ICASSP.

\bibitem[\protect\citename{Williams and Young}2007]{williams07}
J. Williams and S. Young.
\newblock 2007.
\newblock {\em Partially observable markov decision
processes for spoken dialog systems}.
\newblock Computer Speech and Language, vol. 21, no. 2, pp. 231-422.

\bibitem[\protect\citename{Wong and Mooney}2006]{wong06}
Y.W. Wong and R.J. Mooney.
\newblock 2006.
\newblock {\em Learning for Semantic Parsing with Statistical Machine Translation}.
\newblock Proceedings of HLT/NAACL.

\bibitem[\protect\citename{Zettlemoyer and Collins}2007]{zettlemoyer07}
L.S. Zettlemoyer and M. Collins.
\newblock 2005.
\newblock {\em Online learning of relaxed CCG grammars for parsing to logical form}.
\newblock Proceedings of EMNLP-CoNLL.

\end{thebibliography}

\end{document}
